
acs <- read.csv("Data/Raw data/ACS_SchoolDistricts_2010-2021.csv",header = TRUE)
acs <- cbind(LEAID = sprintf("%07d",acs$GEOID),acs)
acs$GEOID <- NULL

# Interpolate for missing data within years
a <- merge(df,acs,by = c("year","LEAID"),all.x = TRUE)
a <- a[is.na(a$population),c("LEAID","year")]
a <- unique(a)

# For each missing values, get closest year
for(i in 1:nrow(a)){
  
  b <- acs[acs$LEAID == a$LEAID[i],]
  b <- b[!is.na(b$year),]
  # Select closest year from available data
  b$diff <- abs(b$year - a$year[i])
  b <- b[b$diff == min(b$diff),]
  b$diff <- NULL
  
  if(nrow(b) !=1){#If the two years around missing data
    b <- as.data.frame(cbind(b$LEAID[1],t(colMeans(b[,-1]))))
    colnames(b) <- colnames(acs)
  }
  b$year <- a$year[i]
  acs <- rbind(acs,b) #add line to acs main dataset   
}
rm(a,b,i)

#Keep variables
acs <- acs[,c("LEAID","year","housing_units","population_votingage","income_mean","realestate_taxes_median")]

df <- merge(df,acs,by = c( "LEAID","year"))

# Convert to numeric
df$housing_units <- as.numeric(df$housing_units)
df$population_votingage <- as.numeric(df$population_votingage)
df$realestate_taxes_median <- as.numeric(df$realestate_taxes_median)

###########################################################################
# Get homeowernship pre-TCJA 
acs <- read.csv("Data/Raw data/ACS_SchoolDistricts_2010-2021.csv",header = TRUE)
acs <- cbind(LEAID = sprintf("%07d",acs$GEOID),acs)
acs$GEOID <- NULL
acs <- acs[acs$year ==2017,]
acs <- acs[,c("LEAID","ownership")]

# For school districs 0605670 and 0642990, homeownership rates is not reported in 2017. We use values in 2015 instead
acs <- rbind(acs,as.data.frame(x = list(LEAID = c("0605670","0642990"),ownership = c(0.7232587,0.6844522))))

colnames(acs) <- c("LEAID","ownership_2017")
df <- merge(df,acs,by = "LEAID")
rm(acs)

###########################################################################
# Get housing units in 2018 otherwise salt change is time-variant
hu <- read.csv("Data/Raw data/ACS_SchoolDistricts_2010-2021.csv",header = TRUE)
hu <- cbind(LEAID = sprintf("%07d",hu$GEOID),hu)
hu$GEOID <- NULL
hu <- hu[hu$year ==2018,]
hu <- hu[,c("LEAID","housing_units")]
colnames(hu)[2] <- "housing_units18"

df <- merge(df,hu,by = "LEAID",all.x = TRUE)
df$housing_units18 <- ifelse(is.na(df$housing_units18),df$housing_units,df$housing_units18)
rm(hu)


###
# Construct new variables using ACS data
df$bond_ph <- (df$bond_amount) / df$housing_units
df$turnout <- as.numeric(df$vote_total) / df$population_votingage
df$turnout <- Winsorize(df$turnout,minval = 0,maxval = 1)

df$salt_change_ph <- df$salt_amt_change / df$housing_units18
df$wasted_salt_ph <- df$wasted_salt / df$housing_units18
df$cost_bond_ph <- df$cost_bond / df$housing_units


df$levy_increase <- ifelse(!is.na(df$levy_parcel),df$levy_parcel / df$realestate_taxes_median*100,NA )
